google.mobility <- read.csv('Data/Global_Mobility_Report.csv') %>%
filter(country_region == 'United States')
confirmed.cases.data <- read.csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv")
# load packages
library(tidyr)
library(DataCombine)
# clean data
confirmed.cases.data2 <- confirmed.cases.data %>%
dplyr::rename(state = Province_State,
county = Admin2) %>%
dplyr::select(-UID, -iso2, -iso3, -code3,
-Lat, -Long_, -Combined_Key, -Country_Region) %>%
gather(date, confirmed.cases,-state,-county,-FIPS) %>%
dplyr::mutate(date = gsub('X', '', date),
date = as.Date(as.character(date), "%m.%d.%y"),
county_state = paste0(county, ', ', state)) %>%
arrange(county_state, date) %>%
dplyr::mutate(lag = slide(.,
Var = 'confirmed.cases',
NewVar = 'new',
GroupVar = 'county_state',
slideBy = -1)[,'new'],
new.cases = confirmed.cases - lag)
##
## Remember to order . by county_state and the time variable before running.
##
## Lagging confirmed.cases by 1 time units.
## calculate rolling averages and remove non-states
covid.smooth.data_county <- confirmed.cases.data2 %>%
dplyr::group_by(county_state, date) %>%
dplyr::summarise(new.cases = sum(new.cases)) %>%
dplyr::group_by(county_state) %>%
dplyr::mutate(cases_14days = zoo::rollmean(new.cases,
k = 14, fill = 0)) %>%
dplyr::ungroup() %>%
mutate() %>%
merge(confirmed.cases.data2 %>%
dplyr::select(county_state, date, county, state)) %>%
filter(!state %in% c('American Samoa',
'Diamond Princess',
'Grand Princess',
'Guam',
'Northern Mariana Islands',
'Puerto Rico',
'Virgin Islands'))
## `summarise()` has grouped output by 'county_state'. You can override using the
## `.groups` argument.
load("Data/county.data.RData")
# patterns for cleaning county names
patterns <- c(' Borough| Census Area| County| Parish| city')
climate.data <- county.data %>%
# grab file with full state names
left_join(read.csv('Data/fips concordance.csv') %>%
dplyr::select(State = Alpha.code,
state_full = Name)) %>%
dplyr::mutate(county = gsub(patterns, '', Area_Name),
county_state = paste0(county, ', ', state_full),
state = state_full) %>%
dplyr::select(county,
county_state,
state,
avg.dec.temp = Dec.Temp.AVG...F,
avg.jan.temp = Jan.Temp.AVG...F,
avg.feb.temp = Feb.Temp.AVG...F,
avg.mar.temp = Mar.Temp.AVG...F,
avg.apr.temp = Apr.Temp.AVG...F,
avg.may.temp = May.Temp.AVG...F,
avg.jun.temp = Jun.Temp.AVG...F,
avg.jul.temp = Jul.Temp.AVG...F,
avg.aug.temp = Aug.Temp.AVG...F,
avg.sep.temp = Sep.Temp.AVG...F,
avg.oct.temp = Oct.Temp.AVG...F,
avg.nov.temp = Nov.Temp.AVG...F)
## Joining with `by = join_by(State)`
mobility.data <- google.mobility %>%
dplyr::mutate(county = gsub(patterns, '', sub_region_2),
state = sub_region_1,
date = as.Date(date)) %>%
dplyr::select(county,
state,
date,
mobility.transit = transit_stations_percent_change_from_baseline,
mobility.workplaces = workplaces_percent_change_from_baseline,
mobility.grocery = grocery_and_pharmacy_percent_change_from_baseline)
mobility.data <- na.omit(mobility.data)
blank_counts <- sapply(mobility.data, function(col) sum(col == ""))
print(blank_counts)
## county state date mobility.transit
## 50513 974 NA 0
## mobility.workplaces mobility.grocery
## 0 0
clean.mobility.data <- mobility.data %>% filter(county != "")
clean.mobility.data <- clean.mobility.data %>% filter(state != "")
# patterns for cleaning county names
patterns <- c(' Borough| Census Area| County| Parish| city')
transit.data <- county.data %>%
# grab file with full state names
left_join(read.csv('Data/fips concordance.csv') %>%
dplyr::select(State = Alpha.code,
state_full = Name)) %>%
dplyr::mutate(county = gsub(patterns, '', Area_Name),
county_state = paste0(county, ', ', state_full),
state = state_full) %>%
dplyr::select(county,
county_state,
state,
transit.scores = transit_scores...population.weighted.averages.aggregated.from.town.city.level.to.county)
## Joining with `by = join_by(State)`
# patterns for cleaning county names
patterns <- c(' Borough| Census Area| County| Parish| city')
# covid case data
load("Data/covid.RData")
covid.data <- covid.smooth.data_county %>%
# county data
left_join(county.data %>%
# grab file with full state names
left_join(read.csv('Data/fips concordance.csv') %>%
dplyr::select(State = Alpha.code,
state_full = Name)) %>%
dplyr::mutate(county = gsub(patterns, '', Area_Name),
county_state = paste0(county, ', ', state_full)) %>%
dplyr::select(county_state,
FIPS = FIPS,
total.population = POP_ESTIMATE_2018,
pop.density = Density.per.square.mile.of.land.area...Population,
age.65.plus = Total_age65plus,
avg.dec.temp = Dec.Temp.AVG...F,
avg.jan.temp = Jan.Temp.AVG...F,
avg.feb.temp = Feb.Temp.AVG...F,
avg.mar.temp = Mar.Temp.AVG...F,
avg.apr.temp = Apr.Temp.AVG...F,
avg.may.temp = May.Temp.AVG...F,
avg.jun.temp = Jun.Temp.AVG...F,
avg.jul.temp = Jul.Temp.AVG...F,
avg.aug.temp = Aug.Temp.AVG...F,
avg.sep.temp = Sep.Temp.AVG...F,
avg.oct.temp = Oct.Temp.AVG...F,
avg.nov.temp = Nov.Temp.AVG...F,
icu.beds = ICU.Beds,
active.physicians = Active.Physicians.per.100000.Population.2018..AAMC.,
active.patient.care.physicians = Total.Active.Patient.Care.Physicians.per.100000.Population.2018..AAMC.,
housing.density = Density.per.square.mile.of.land.area...Housing.units,
transit.scores = transit_scores...population.weighted.averages.aggregated.from.town.city.level.to.county)) %>%
merge(google.mobility %>%
dplyr::mutate(county = gsub(patterns, '', sub_region_2),
county_state = paste0(county, ', ', sub_region_1),
date = as.Date(date),
mobility.transit = ifelse(is.na(transit_stations_percent_change_from_baseline), 0,
transit_stations_percent_change_from_baseline)) %>%
dplyr::select(county_state,
date,
mobility.transit,
mobility.workplaces = workplaces_percent_change_from_baseline,
mobility.grocery = grocery_and_pharmacy_percent_change_from_baseline)) %>%
# convert to rates
mutate(new.cases = new.cases / (total.population / 100000),
# icu.beds = icu.beds / (total.population / 100000),
age.65.plus = age.65.plus / total.population)
## Joining with `by = join_by(State)`
## Joining with `by = join_by(county_state)`
# avg.winter.temp = (avg.dec.temp + avg.jan.temp + avg.feb.temp) / 3)
# lag new cases
new.cases.rate.lagged <- DataCombine::slide(covid.data,
Var = 'new.cases',
NewVar = 'new.cases.lag',
GroupVar = 'county_state',
slideBy = -1)
##
## Remember to order covid.data by county_state and the time variable before running.
##
## Lagging new.cases by 1 time units.
# lag workplace mobility
workplace.lag14 <- DataCombine::slide(covid.data,
Var = 'mobility.workplaces',
NewVar = 'workplaces.lag14',
GroupVar = 'county_state',
slideBy = -14)
##
## Remember to order covid.data by county_state and the time variable before running.
##
## Lagging mobility.workplaces by 14 time units.
##
##
## Warning: the following groups have 13 or fewer observations.
## No valid lag/lead can be created, so they are dropped:
##
## Barber, Kansas
## Carter, Missouri
## Coke, Texas
## Day, South Dakota
## East Carroll, Louisiana
## Florence, Wisconsin
## Furnas, Nebraska
## Gentry, Missouri
## Gray, Kansas
## Hot Springs, Wyoming
## Kiowa, Colorado
## Lafayette, Arkansas
## Lake of the Woods, Minnesota
## Luce, Michigan
## Lyman, South Dakota
## Nome, Alaska
## Ontonagon, Michigan
## Ouray, Colorado
## Pondera, Montana
## Real, Texas
## Scotland, Missouri
## Scott, Illinois
## Stevens, Kansas
## Tripp, South Dakota
## Washington, Colorado
## Wheeler, Texas
## Wirt, West Virginia
## Wolfe, Kentucky
# combine
covid.data <- covid.data %>%
merge(new.cases.rate.lagged %>%
dplyr::select(county_state, date,
new.cases.lag)) %>%
merge(workplace.lag14 %>%
dplyr::select(county_state, date,
workplaces.lag14))
#covid.data <- covid.data %>%
# merge(confirmed.cases.data %>%
# dplyr::rename(state = Province_State,
# county = Admin2) %>%
# dplyr::select(-Lat, -Long_))
colnames(covid.data)
## [1] "county_state" "date"
## [3] "new.cases" "cases_14days"
## [5] "county" "state"
## [7] "FIPS" "total.population"
## [9] "pop.density" "age.65.plus"
## [11] "avg.dec.temp" "avg.jan.temp"
## [13] "avg.feb.temp" "avg.mar.temp"
## [15] "avg.apr.temp" "avg.may.temp"
## [17] "avg.jun.temp" "avg.jul.temp"
## [19] "avg.aug.temp" "avg.sep.temp"
## [21] "avg.oct.temp" "avg.nov.temp"
## [23] "icu.beds" "active.physicians"
## [25] "active.patient.care.physicians" "housing.density"
## [27] "transit.scores" "mobility.transit"
## [29] "mobility.workplaces" "mobility.grocery"
## [31] "new.cases.lag" "workplaces.lag14"
Add shape files for any mapping tasks later
us.shape.1 <-
st_read("Data/US shapefiles/cb_2016_us_county_20m.shp")
## Reading layer `cb_2016_us_county_20m' from data source
## `C:\Users\emily\Documents\GitHub\DATA4207-GroupProject\group-project-3\Data\US shapefiles\cb_2016_us_county_20m.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 3220 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -179.1743 ymin: 17.91377 xmax: 179.7739 ymax: 71.35256
## Geodetic CRS: NAD83
us.shape.1 <- us.shape.1 %>%
shift_geometry() #This function moves alaska into a easier area to interpret
fip.concordance <- read.csv("Data/fips concordance.csv")
fip.concordance <- read.csv("Data/fips concordance.csv")
us.shape.2 <- us.shape.1 %>%
merge(fip.concordance %>%
mutate(STATEFP = ifelse(Numeric.code >= 0 & Numeric.code <= 9,
paste0(0 , Numeric.code), Numeric.code)) %>%
dplyr::rename(state = Alpha.code))
us.shape.2$county <- paste0(us.shape.2$NAME, ", ", us.shape.2$state)
us.shape.2$county <- gsub("Doña Ana, NM", "Dona Ana, NM", us.shape.2$county)
us.shape.2$county <- gsub("LaSalle, LA", "La Salle, LA", us.shape.2$county)
us.shape.2$county <- gsub("Oglala Lakota, SD", "Oglala, SD", us.shape.2$county)
# Summary statistics for age over 65
summary(covid.data$age.65.plus)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.048 0.156 0.181 0.184 0.208 0.576 23767
# Correlation between age over 65 and new cases
cor(covid.data$age.65.plus, covid.data$new.cases, use = "complete.obs")
## [1] -0.03972087
# Scatter plot
plot(covid.data$new.cases, covid.data$age.65.plus, ylab = "Age Over 65", xlab = "New COVID-19 Cases", main = "Age Over 65 vs. New COVID-19 Cases")
specific_date <- as.Date("2020-07-20 ")
filtered_data <- covid.data %>%
filter(date == specific_date)
# Summary statistics for population density
summary(covid.data$pop.density)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.1 32.4 72.4 650.6 274.5 69468.4 2486
# Correlation between population density and new cases
cor(covid.data$pop.density, covid.data$new.cases, use = "complete.obs")
## [1] 0.04094492
# Scatter plot
plot(filtered_data$new.cases, filtered_data$pop.density, ylab = "Population Density", xlab = "New COVID-19 Cases", main = "Population Density vs. New COVID-19 Cases")
# Summary statistics for active physicians per 100,000
summary(covid.data$active.patient.care.physicians)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 174.8 202.9 231.8 229.9 240.7 353.7 1919
# Correlation between active physicians and new cases
cor(covid.data$active.patient.care.physicians, covid.data$new.cases, use = "complete.obs")
## [1] -0.002778664
# Scatter plot
plot(covid.data$new.cases, covid.data$active.patient.care.physicians, ylab = "Active Physicians per 100,000", xlab = "New COVID-19 Cases", main = "Physicians vs. New COVID-19 Cases")
na_count_per_column <- transit.data %>%
summarise_all(~ sum(is.na(.)))
print(na_count_per_column)
## county county_state state transit.scores
## 1 0 0 1 212
clean.transit.data <- na.omit(transit.data)
# standardise
clean.transit.data <- clean.transit.data %>%
mutate(z.transit = (transit.scores - mean(transit.scores, na.rm=T)) /
sd(transit.scores, na.rm=T))
ggplot(clean.transit.data, aes(x = z.transit)) +
geom_histogram(binwidth = 1, fill = "blue", color = "black", alpha = 0.7) +
labs(title = "Standardised transit scores in America",
x = "Transit Scores",
y = "Frequency") +
bbplot::bbc_style()+
theme(plot.title = element_text(size = 12),
axis.title.x = element_text(size = 8), # Adjust the x-axis label size
axis.title.y = element_text(size = 8))
us.shape.2 <- us.shape.2 %>%
select(-county, -Name, -Status, -COUNTYFP, -COUNTYNS, -STATEFP, -AFFGEOID, -LSAD, -ALAND, -AWATER, -Numeric.code, -Status)%>%
rename(county = NAME)
transit.map <- full_join(us.shape.2,
clean.transit.data %>%
select(county, state, z.transit),
by = "county") %>%
st_as_sf()
ggplot(transit.map) +
geom_sf(mapping = aes(fill = z.transit), colour = NA, inherit.aes = FALSE) +
scale_fill_gradient2(
low = "#2c7bb6", # Blue
mid = "#ffffbf", # Yellow (midpoint)
high = "#d7191c", # Red
midpoint = mean(transit.map$z.transit, na.rm = TRUE),
limits = c(min(transit.map$z.transit, na.rm = TRUE),
max(transit.map$z.transit, na.rm = TRUE)),
label = scales::comma,
na.value = "grey"
) +
labs(title = 'Standardised Transit Distribution') +
guides(fill = guide_legend(keyheight = 0.3)) +
theme_void() +
bbplot::bbc_style() +
theme(
plot.title = element_text(size = 16), # Make the title larger
legend.position = 'bottom',
legend.title = element_blank(),
legend.text = element_text(size = 10), # Make legend text smaller
axis.title = element_text(size = 10), # Make axis labels smaller
axis.text = element_text(size = 8) # Make axis text smaller
)
# install.packages("zoo")
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
summarise.mobility.data <- clean.mobility.data %>%
group_by(date) %>%
summarise(mobility.transit = mean(mobility.transit, na.rm = TRUE),
mobility.grocery = mean(mobility.grocery, na.rm = TRUE),
mobility.workplaces = mean(mobility.workplaces, na.rm = TRUE))
# Mobility Values: Each value in the dataset represents a percentage change from a baseline.
# Rolling Average: The 7-day rolling average smooths these daily percentage changes to show a trend over time.
# Percentage Representation: The rolling average remains a percentage because it’s derived from percentage values.
# Calculate rolling 7 day average
grocery.mobility.data <- summarise.mobility.data %>%
arrange(date) %>%
mutate(rolling_avg = zoo::rollmean(mobility.grocery, 7, fill = NA, align = "right"))
transit.mobility.data <- summarise.mobility.data %>%
arrange(date) %>%
mutate(rolling_avg = zoo::rollmean(mobility.transit, 7, fill = NA, align = "right"))
workplace.mobility.data <- summarise.mobility.data %>%
arrange(date) %>%
mutate(rolling_avg = zoo::rollmean(mobility.workplaces, 7, fill = NA, align = "right"))
ggplot(grocery.mobility.data, aes(x = date, y = rolling_avg)) +
geom_line(color = "blue") +
labs(title = "Changes in Visitors to Grocery and Pharmacy Stores Over Time",
x = "Date",
y = "7-day Rolling Average of Mobility (%)") +
bbplot::bbc_style()+
theme(plot.title = element_text(size = 12),
axis.title.x = element_text(size = 8), # Adjust the x-axis label size
axis.title.y = element_text(size = 8))
ggplot(grocery.mobility.data, aes(x = date, y = rolling_avg)) +
geom_line(color = "blue", size = 1) + # Thicker line for better visibility
# geom_point(color = "red", size = 1, alpha = 0.7) + # Adding points for data
labs(title = "Changes in Visitors to Grocery and Pharmacy Stores Over Time",
x = "Date",
y = "7-day Rolling Average of Mobility (%)") +
bbplot::bbc_style() +
theme(
plot.title = element_text(size = 16, face = "bold"), # Larger, bold title
axis.title.x = element_text(size = 12), # Larger x-axis label
axis.title.y = element_text(size = 12), # Larger y-axis label
axis.text.x = element_text(size = 10, angle = 45, hjust = 1), # Rotate x-axis labels for better readability
axis.text.y = element_text(size = 10)
) +
scale_x_date(date_labels = "%b %Y", date_breaks = "1 month") + # Format x-axis dates
geom_vline(xintercept = as.numeric(as.Date(c("2020-03-11", "2020-12-25"))), # Add vertical lines for significant dates
linetype = "dashed", color = "grey50") +
annotate("text", x = as.Date("2020-03-11"), y = max(grocery.mobility.data$rolling_avg, na.rm = TRUE)* -0.5,
label = "Pandemic Declared", angle = 90, vjust = -0.5, size = 3, color = "grey50") +
annotate("text", x = as.Date("2020-12-25"), y = max(grocery.mobility.data$rolling_avg, na.rm = TRUE)* 0.8,
label = "Christmas", angle = 90, vjust = -0.5, size = 3, color = "grey50")
ggplot(transit.mobility.data, aes(x = date, y = rolling_avg)) +
geom_line(color = "blue") +
labs(title = "Changes in Visitors to Transit Over Time",
x = "Date",
y = "7-day Rolling Average of Mobility (%)") +
theme_minimal()
ggplot(workplace.mobility.data, aes(x = date, y = rolling_avg)) +
geom_line(color = "blue") +
labs(title = "Changes in Visitors to Workplaces Over Time",
x = "Date",
y = "7-day Rolling Average of Mobility (%)") +
theme_minimal()
# Add a type column to each dataframe
grocery.mobility.data <- grocery.mobility.data %>%
mutate(type = "Grocery and Pharmacy")
transit.mobility.data <- transit.mobility.data %>%
mutate(type = "Transit")
workplace.mobility.data <- workplace.mobility.data %>%
mutate(type = "Workplace")
# Combine the dataframes into one
combined_mobility_data <- bind_rows(grocery.mobility.data, transit.mobility.data, workplace.mobility.data)
# Create the ggplot
p <- ggplot(combined_mobility_data, aes(x = date, y = rolling_avg, color = type)) +
geom_line() +
labs(title = "Changes in Mobility Over Time",
x = "Date",
y = "7-day Rolling Average of Mobility (%)",
color = "Mobility Type") +
theme_minimal()
# Convert the ggplot to an interactive plotly plot
interactive_plot <- ggplotly(p)
# Display the interactive plot
interactive_plot
# Changing the climate data from a wide format to long format:
climate.data <- climate.data %>%
dplyr::select(county,
state,
avg.temp.jan = avg.jan.temp,
avg.temp.feb = avg.feb.temp,
avg.temp.mar = avg.mar.temp,
avg.temp.apr = avg.apr.temp,
avg.temp.may = avg.may.temp,
avg.temp.jun = avg.jun.temp,
avg.temp.jul = avg.jul.temp,
avg.temp.aug = avg.aug.temp,
avg.temp.sep = avg.sep.temp,
avg.temp.oct = avg.oct.temp,
avg.temp.nov = avg.nov.temp,
avg.temp.dec = avg.dec.temp)
climate.data.long <- climate.data %>%
pivot_longer(
cols = starts_with("avg.temp."),
names_to = "month",
values_to = "temp"
)
climate.data.long <- climate.data.long %>%
mutate(month = case_when(
month == "avg.temp.jan" ~ "Jan",
month == "avg.temp.feb" ~ "Feb",
month == "avg.temp.mar" ~ "Mar",
month == "avg.temp.apr" ~ "Apr",
month == "avg.temp.may" ~ "May",
month == "avg.temp.jun" ~ "Jun",
month == "avg.temp.jul" ~ "Jul",
month == "avg.temp.aug" ~ "Aug",
month == "avg.temp.sep" ~ "Sep",
month == "avg.temp.oct" ~ "Oct",
month == "avg.temp.nov" ~ "Nov",
month == "avg.temp.dec" ~ "Dec",
TRUE ~ month
))
# us.shape.2 <- us.shape.2 %>%
# select(-county, -Name, -Status, -COUNTYFP, -COUNTYNS, -STATEFP, -AFFGEOID, -GEOID, -LSAD, -ALAND, -AWATER, -Numeric.code, -Status)%>%
# rename(county = NAME)
# Don't want to include missing data since its not useful - hence inner join
climate.map <- full_join(us.shape.2,
climate.data.long %>%
select(county, state, month, temp),
by = "county") %>%
st_as_sf()
# To acknolwedge its a shape file
climate.state <- climate.data.long %>%
group_by(state, month)%>%
summarise(avg_temp = mean(temp, na.rm= TRUE, .groups = 'drop'))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
ggplot(climate.state, aes(x = month, y = avg_temp, group = state, color = state)) +
geom_line() +
geom_point() +
labs(title = "Average Temperature Over Each Month Per State",
x = "Month",
y = "Average Temperature") +
theme_minimal() +
theme(
plot.title = element_text(size = 12), # Set title size
axis.title.x = element_text(size = 10), # Set x-axis label size
axis.title.y = element_text(size = 10), # Set y-axis label size
legend.position = "bottom", # Move legend below plot
legend.title = element_blank(), # Remove legend title
legend.text = element_text(size = 8) # Set legend text size
) +
guides(color = guide_legend(nrow = 5)) # Set legend to one row
# Create the plot using ggplot
p <- ggplot(climate.state, aes(x = month, y = avg_temp, group = state, color = state)) +
geom_line() +
geom_point() +
labs(title = "Average Temperature Over Each Month Per State",
x = "Month",
y = "Average Temperature") +
theme_minimal()
# Convert the ggplot object to plotly
p <- ggplotly(p, tooltip = c("x", "y", "color"))
# Show the interactive plot
p
month_order <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
# Convert the month variable to a factor with the desired order
climate.map$month <- factor(climate.map$month, levels = month_order)
ggplot(climate.map) +
geom_sf(mapping = aes(fill = temp), colour = NA, inherit.aes = FALSE) +
scale_fill_gradient2(low = "#0072B2",
high = "red",
midpoint = mean(climate.map$temp, na.rm = TRUE),
limits = c(min(climate.map$temp, na.rm = TRUE),
max(climate.map$temp, na.rm = TRUE)),
label = scales::comma,
na.value = "grey") +
labs(title = 'Average Climate distribution') +
guides(fill = guide_legend(keyheight = 0.3)) +
theme_void() +
theme(plot.title = element_text(size = 7),
legend.position = 'bottom',
legend.title = element_blank()) +
facet_wrap(~month) # This line tells ggplot to create separate panels for each unique value in the month column. Each panel will display the map for one month.
# Check column names and data types
# Ensure FIPS column is correctly named and of the same type in both data frames
covid.data <- covid.data %>%
mutate(FIPS = as.character(FIPS))
us.shape.2 <- us.shape.2 %>%
mutate(GEOID = as.character(GEOID))
# Function to perform join in chunks
join_in_chunks <- function(shape_data, covid_data, chunk_size = 1000) {
n <- nrow(shape_data)
result <- NULL
for (i in seq(1, n, by = chunk_size)) {
chunk <- shape_data[i:min(i + chunk_size - 1, n), ]
chunk_result <- chunk %>%
left_join(covid_data, by = c("GEOID" = "FIPS"))
if (is.null(result)) {
result <- chunk_result
} else {
result <- bind_rows(result, chunk_result)
}
}
return(result)
}
# Perform join in chunks
county_data <- join_in_chunks(us.shape.2, covid.data, chunk_size = 1000)
county_data <- st_transform(county_data, crs = st_crs(5070))
# Function to plot in chunks
county_data <- st_simplify(county_data, dTolerance = 1000)
# Function to plot in chunks
plot_in_chunks <- function(data, chunk_size = 1000) {
n <- nrow(data)
for (i in seq(1, n, by = chunk_size)) {
chunk <- data[i:min(i + chunk_size - 1, n), ]
# Plot Population Density
p<- ggplot(data = chunk) +
geom_sf(aes(fill = pop_density), color = NA) +
scale_fill_viridis_c(option = "viridis") +
theme_minimal() +
labs(title = paste("Population Density by County (Chunk", i, "to", min(i + chunk_size - 1, n), ")"),
fill = "Density (per sq. mile)")
p
}
}
# Call the function to plot in chunks
plot_in_chunks(county_data, chunk_size = 1000)
# Plot Age Over 65
ggplot(data = county_data) +
geom_sf(aes(fill = age.65.plus), color = NA) +
scale_fill_viridis_c(option = "viridis") +
theme_minimal() +
labs(title = "Proportion of Population Age 65+ by County",
fill = "Proportion")
# Plot Active Physicians per 100,000
ggplot(data = county_data) +
geom_sf(aes(fill = active.patient.care.physicians), color = NA) +
scale_fill_viridis_c(option = "viridis") +
theme_minimal() +
labs(title = "Active Physicians per 100,000 by County",
fill = "Physicians/100k")
“county_state” “date” “new.cases”
[4] “cases_14days” “county” “state”
[7] “FIPS” “total.population” “pop.density”
[10] “age.65.plus” “avg.dec.temp” “avg.jan.temp”
[13] “avg.feb.temp” “avg.mar.temp” “avg.apr.temp”
[16] “avg.may.temp” “avg.jun.temp” “avg.jul.temp”
[19] “avg.aug.temp” “avg.sep.temp” “avg.oct.temp”
[22] “avg.nov.temp” “icu.beds” “active.physicians”
[25] “active.patient.care.physicians” “housing.density”
“transit.scores”
[28] “mobility.transit” “mobility.workplaces” “mobility.grocery”
[31] “new.cases.lag” “workplaces.lag14”
model2 <- lm(cases_14days ~ avg.jan.temp + avg.feb.temp + avg.mar.temp + avg.apr.temp + avg.may.temp + avg.jun.temp + avg.jul.temp + avg.aug.temp + avg.sep.temp + avg.oct.temp + avg.nov.temp + avg.dec.temp , covid.data)
summary(model2)
##
## Call:
## lm(formula = cases_14days ~ avg.jan.temp + avg.feb.temp + avg.mar.temp +
## avg.apr.temp + avg.may.temp + avg.jun.temp + avg.jul.temp +
## avg.aug.temp + avg.sep.temp + avg.oct.temp + avg.nov.temp +
## avg.dec.temp, data = covid.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -788 -56 -29 7 38704
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -397.00403 4.06516 -97.66 <2e-16 ***
## avg.jan.temp 1.49125 0.11727 12.72 <2e-16 ***
## avg.feb.temp -7.73632 0.08672 -89.21 <2e-16 ***
## avg.mar.temp 6.24124 0.15992 39.03 <2e-16 ***
## avg.apr.temp 4.16294 0.17651 23.59 <2e-16 ***
## avg.may.temp -5.82414 0.11941 -48.77 <2e-16 ***
## avg.jun.temp -24.30396 0.18125 -134.09 <2e-16 ***
## avg.jul.temp 26.63539 0.18637 142.92 <2e-16 ***
## avg.aug.temp -8.48234 0.16586 -51.14 <2e-16 ***
## avg.sep.temp 2.30451 0.12589 18.31 <2e-16 ***
## avg.oct.temp 8.02431 0.11472 69.95 <2e-16 ***
## avg.nov.temp 9.33308 0.17684 52.78 <2e-16 ***
## avg.dec.temp -3.31985 0.12168 -27.28 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 240.6 on 2379966 degrees of freedom
## (10961 observations deleted due to missingness)
## Multiple R-squared: 0.03495, Adjusted R-squared: 0.03495
## F-statistic: 7183 on 12 and 2379966 DF, p-value: < 2.2e-16
model3 <- lm(cases_14days ~mobility.transit +mobility.grocery +mobility.workplaces + avg.jan.temp + avg.feb.temp + avg.mar.temp + avg.apr.temp + avg.may.temp + avg.jun.temp + avg.jul.temp + avg.aug.temp + avg.sep.temp + avg.oct.temp + avg.nov.temp + avg.dec.temp, covid.data)
summary(model3)
##
## Call:
## lm(formula = cases_14days ~ mobility.transit + mobility.grocery +
## mobility.workplaces + avg.jan.temp + avg.feb.temp + avg.mar.temp +
## avg.apr.temp + avg.may.temp + avg.jun.temp + avg.jul.temp +
## avg.aug.temp + avg.sep.temp + avg.oct.temp + avg.nov.temp +
## avg.dec.temp, data = covid.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -799 -74 -30 18 38597
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -561.34293 6.63602 -84.59 <2e-16 ***
## mobility.transit -1.86760 0.01204 -155.09 <2e-16 ***
## mobility.grocery -0.67687 0.01799 -37.63 <2e-16 ***
## mobility.workplaces -0.91827 0.01813 -50.65 <2e-16 ***
## avg.jan.temp 4.81217 0.20528 23.44 <2e-16 ***
## avg.feb.temp -10.46979 0.15244 -68.68 <2e-16 ***
## avg.mar.temp 8.94395 0.26505 33.74 <2e-16 ***
## avg.apr.temp -3.75114 0.30867 -12.15 <2e-16 ***
## avg.may.temp -6.13216 0.19875 -30.85 <2e-16 ***
## avg.jun.temp -24.58477 0.29336 -83.81 <2e-16 ***
## avg.jul.temp 31.89003 0.32071 99.44 <2e-16 ***
## avg.aug.temp -13.60139 0.30398 -44.74 <2e-16 ***
## avg.sep.temp 10.87912 0.22105 49.22 <2e-16 ***
## avg.oct.temp 4.75981 0.19643 24.23 <2e-16 ***
## avg.nov.temp 10.11356 0.31310 32.30 <2e-16 ***
## avg.dec.temp -2.65084 0.21474 -12.34 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 300 on 1443118 degrees of freedom
## (947806 observations deleted due to missingness)
## Multiple R-squared: 0.06917, Adjusted R-squared: 0.06916
## F-statistic: 7149 on 15 and 1443118 DF, p-value: < 2.2e-16
library(lmtest)
# Durbin-Watson test
dwtest(model3)
##
## Durbin-Watson test
##
## data: model3
## DW = 0.016454, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
# Plot residuals vs fitted values
plot(fitted(model3), residuals(model3), main = "Residuals vs Fitted")
abline(h = 0, col = "red")
# Q-Q plot
qqnorm(residuals(model3))
qqline(residuals(model3), col = "red")
areas more than others?
COVID-19